This is the same notebook as 100_species, but we look at the rottnest queryset. This should be slightly harder. # Setup

library(tidyverse)
library(forcats)
library(cowplot)
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:lubridate':
## 
##     stamp
library(ggupset)
library(RColorBrewer)
library(patchwork)
## 
## Attaching package: 'patchwork'
## The following object is masked from 'package:cowplot':
## 
##     align_plots
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.6-4
library(pheatmap)
library(broom)
source('code/helpers.R')
data <- targets::tar_read(merged_all_results)
# rename BLAST to BLAST97 to differentiate from BLAST100 (percentage identity in both cases)
data <- data |> mutate(Type = str_replace(Type, '^BLAST$', 'BLAST97'))
truth <- targets::tar_read(truth_set_data)
table(data$Type)
## 
##    BLAST100     BLAST97  Kraken_0.0 Kraken_0.05  Kraken_0.1    Metabuli 
##        9025       13893       82560       82560       55040       32545 
##     MMSeqs2      Mothur         NBC      Qiime2     VSEARCH 
##       89856       87048       98280       13229       13331

Let’s remove the >0.2 Kraken runs, those are too strict

data <- data |> filter(!Type %in% c('Kraken_0.2', 'Kraken_0.3', 'Kraken_0.4', 'Kraken_0.5', 'Kraken_0.6', 'Kraken_0.7', 'Kraken_0.8', 'Kraken_0.9'))

Made a mistake- Metabuli’s Database is misspelled

data <- data |> mutate(Subject = str_replace_all(Subject, pattern = '_ref.fasta', replacement=''))
table(data$Query)
## 
##                                                                                KWest_16S_PooledTrue.fa 
##                                                                                                 132865 
##   make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_Lulu_RESULTS_dada2_asv.fa 
##                                                                                                  54092 
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_noLulu_RESULTS_dada2_asv.fa 
##                                                                                                  54092 
##   make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_Lulu_RESULTS_dada2_asv.fa 
##                                                                                                  52001 
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_noLulu_RESULTS_dada2_asv.fa 
##                                                                                                  52001 
##                               make_12s_16s_simulated_reads_6-fakeGenes_GreenGenes_RESULTS_dada2_asv.fa 
##                                                                                                  44774 
##                                   make_12s_16s_simulated_reads_6-fakeGenes_Random_RESULTS_dada2_asv.fa 
##                                                                                                  45205 
##                     make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFlow_12S_RESULTS_dada2_asv.fa 
##                                                                                                  13384 
##                     make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFlow_16S_RESULTS_dada2_asv.fa 
##                                                                                                  14233 
##                           make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_12S_RESULTS_dada2_asv.fa 
##                                                                                                  55763 
##                           make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_16S_RESULTS_dada2_asv.fa 
##                                                                                                  58957

Rottnest species

Check 12S results

table(data$Subject)
## 
##                                                    12s_v010_final.fasta 
##                                                                    8327 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_1.fasta 
##                                                                    7830 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_10.fasta 
##                                                                    7687 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_2.fasta 
##                                                                    7783 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_3.fasta 
##                                                                    7849 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_4.fasta 
##                                                                    7723 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_5.fasta 
##                                                                    7854 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_6.fasta 
##                                                                    7574 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_7.fasta 
##                                                                    7628 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_8.fasta 
##                                                                    7973 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_9.fasta 
##                                                                    7930 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_1.fasta 
##                                                                    8112 
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_10.fasta 
##                                                                    7938 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_2.fasta 
##                                                                    7926 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_3.fasta 
##                                                                    8113 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_4.fasta 
##                                                                    7981 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_5.fasta 
##                                                                    7967 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_6.fasta 
##                                                                    7866 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_7.fasta 
##                                                                    7997 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_8.fasta 
##                                                                    7896 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_9.fasta 
##                                                                    8009 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_1.fasta 
##                                                                    7361 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_10.fasta 
##                                                                    7370 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_2.fasta 
##                                                                    7584 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_3.fasta 
##                                                                    7474 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_4.fasta 
##                                                                    7582 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_5.fasta 
##                                                                    7565 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_6.fasta 
##                                                                    7308 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_7.fasta 
##                                                                    7389 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_8.fasta 
##                                                                    7343 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_9.fasta 
##                                                                    7569 
##                                                    12S_v10_HmmCut.fasta 
##                                                                    5800 
##                                                     16S_v04_final.fasta 
##                                                                    9457 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_1.fasta 
##                                                                    8244 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_10.fasta 
##                                                                    8018 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_2.fasta 
##                                                                    8478 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_3.fasta 
##                                                                    8477 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_4.fasta 
##                                                                    8446 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_5.fasta 
##                                                                    8668 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_6.fasta 
##                                                                    8193 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_7.fasta 
##                                                                    8644 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_8.fasta 
##                                                                    8078 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_9.fasta 
##                                                                    8232 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_1.fasta 
##                                                                    8897 
##  16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_10.fasta 
##                                                                    8983 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_2.fasta 
##                                                                    8838 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_3.fasta 
##                                                                    8622 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_4.fasta 
##                                                                    8831 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_5.fasta 
##                                                                    9114 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_6.fasta 
##                                                                    8716 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_7.fasta 
##                                                                    9044 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_8.fasta 
##                                                                    9003 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_9.fasta 
##                                                                    9011 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_1.fasta 
##                                                                    7764 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_10.fasta 
##                                                                    8018 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_2.fasta 
##                                                                    7965 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_3.fasta 
##                                                                    7888 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_4.fasta 
##                                                                    8314 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_5.fasta 
##                                                                    7815 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_6.fasta 
##                                                                    7641 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_7.fasta 
##                                                                    7938 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_8.fasta 
##                                                                    7817 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_9.fasta 
##                                                                    8011 
##                                                    16S_v04_HmmCut.fasta 
##                                                                    6430 
##                                                     c01_v03_final.fasta 
##                                                                    1720 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_1.fasta 
##                                                                    1720 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_10.fasta 
##                                                                    3124 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_2.fasta 
##                                                                    1720 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_3.fasta 
##                                                                    3124 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_4.fasta 
##                                                                    1720 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_5.fasta 
##                                                                    1720 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_6.fasta 
##                                                                    3124 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_7.fasta 
##                                                                    1720 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_8.fasta 
##                                                                    1720 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_9.fasta 
##                                                                    3124 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_1.fasta 
##                                                                    1720 
##  c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_10.fasta 
##                                                                    1720 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_2.fasta 
##                                                                    3124 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_3.fasta 
##                                                                    1720 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_4.fasta 
##                                                                    1720 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_5.fasta 
##                                                                    3124 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_6.fasta 
##                                                                    1720 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_7.fasta 
##                                                                    1720 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_8.fasta 
##                                                                    1720 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_9.fasta 
##                                                                    1720 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_1.fasta 
##                                                                    1720 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_10.fasta 
##                                                                    1720 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_2.fasta 
##                                                                    1720 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_3.fasta 
##                                                                    1720 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_4.fasta 
##                                                                    1720 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_5.fasta 
##                                                                    1720 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_6.fasta 
##                                                                    1720 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_7.fasta 
##                                                                    1720 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_8.fasta 
##                                                                    1720 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_9.fasta 
##                                                                    1720 
##                                                    c01_v03_HmmCut.fasta 
##                                                                    1720
table(data$Subject)
## 
##                                                    12s_v010_final.fasta 
##                                                                    8327 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_1.fasta 
##                                                                    7830 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_10.fasta 
##                                                                    7687 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_2.fasta 
##                                                                    7783 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_3.fasta 
##                                                                    7849 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_4.fasta 
##                                                                    7723 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_5.fasta 
##                                                                    7854 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_6.fasta 
##                                                                    7574 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_7.fasta 
##                                                                    7628 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_8.fasta 
##                                                                    7973 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_9.fasta 
##                                                                    7930 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_1.fasta 
##                                                                    8112 
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_10.fasta 
##                                                                    7938 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_2.fasta 
##                                                                    7926 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_3.fasta 
##                                                                    8113 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_4.fasta 
##                                                                    7981 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_5.fasta 
##                                                                    7967 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_6.fasta 
##                                                                    7866 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_7.fasta 
##                                                                    7997 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_8.fasta 
##                                                                    7896 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_9.fasta 
##                                                                    8009 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_1.fasta 
##                                                                    7361 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_10.fasta 
##                                                                    7370 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_2.fasta 
##                                                                    7584 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_3.fasta 
##                                                                    7474 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_4.fasta 
##                                                                    7582 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_5.fasta 
##                                                                    7565 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_6.fasta 
##                                                                    7308 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_7.fasta 
##                                                                    7389 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_8.fasta 
##                                                                    7343 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_9.fasta 
##                                                                    7569 
##                                                    12S_v10_HmmCut.fasta 
##                                                                    5800 
##                                                     16S_v04_final.fasta 
##                                                                    9457 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_1.fasta 
##                                                                    8244 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_10.fasta 
##                                                                    8018 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_2.fasta 
##                                                                    8478 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_3.fasta 
##                                                                    8477 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_4.fasta 
##                                                                    8446 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_5.fasta 
##                                                                    8668 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_6.fasta 
##                                                                    8193 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_7.fasta 
##                                                                    8644 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_8.fasta 
##                                                                    8078 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_9.fasta 
##                                                                    8232 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_1.fasta 
##                                                                    8897 
##  16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_10.fasta 
##                                                                    8983 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_2.fasta 
##                                                                    8838 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_3.fasta 
##                                                                    8622 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_4.fasta 
##                                                                    8831 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_5.fasta 
##                                                                    9114 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_6.fasta 
##                                                                    8716 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_7.fasta 
##                                                                    9044 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_8.fasta 
##                                                                    9003 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_9.fasta 
##                                                                    9011 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_1.fasta 
##                                                                    7764 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_10.fasta 
##                                                                    8018 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_2.fasta 
##                                                                    7965 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_3.fasta 
##                                                                    7888 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_4.fasta 
##                                                                    8314 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_5.fasta 
##                                                                    7815 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_6.fasta 
##                                                                    7641 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_7.fasta 
##                                                                    7938 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_8.fasta 
##                                                                    7817 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_9.fasta 
##                                                                    8011 
##                                                    16S_v04_HmmCut.fasta 
##                                                                    6430 
##                                                     c01_v03_final.fasta 
##                                                                    1720 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_1.fasta 
##                                                                    1720 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_10.fasta 
##                                                                    3124 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_2.fasta 
##                                                                    1720 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_3.fasta 
##                                                                    3124 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_4.fasta 
##                                                                    1720 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_5.fasta 
##                                                                    1720 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_6.fasta 
##                                                                    3124 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_7.fasta 
##                                                                    1720 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_8.fasta 
##                                                                    1720 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_9.fasta 
##                                                                    3124 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_1.fasta 
##                                                                    1720 
##  c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_10.fasta 
##                                                                    1720 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_2.fasta 
##                                                                    3124 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_3.fasta 
##                                                                    1720 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_4.fasta 
##                                                                    1720 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_5.fasta 
##                                                                    3124 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_6.fasta 
##                                                                    1720 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_7.fasta 
##                                                                    1720 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_8.fasta 
##                                                                    1720 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_9.fasta 
##                                                                    1720 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_1.fasta 
##                                                                    1720 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_10.fasta 
##                                                                    1720 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_2.fasta 
##                                                                    1720 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_3.fasta 
##                                                                    1720 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_4.fasta 
##                                                                    1720 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_5.fasta 
##                                                                    1720 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_6.fasta 
##                                                                    1720 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_7.fasta 
##                                                                    1720 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_8.fasta 
##                                                                    1720 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_9.fasta 
##                                                                    1720 
##                                                    c01_v03_HmmCut.fasta 
##                                                                    1720
twelveS_data <- data |> filter(Subject == '12s_v010_final.fasta')
sixteenS_data <- data |> filter(Subject == '16S_v04_final.fasta')
table(twelveS_data$Query)
## 
##                                                                                KWest_16S_PooledTrue.fa 
##                                                                                                   1827 
##   make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_Lulu_RESULTS_dada2_asv.fa 
##                                                                                                    947 
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_noLulu_RESULTS_dada2_asv.fa 
##                                                                                                    947 
##   make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_Lulu_RESULTS_dada2_asv.fa 
##                                                                                                    645 
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_noLulu_RESULTS_dada2_asv.fa 
##                                                                                                    645 
##                               make_12s_16s_simulated_reads_6-fakeGenes_GreenGenes_RESULTS_dada2_asv.fa 
##                                                                                                    594 
##                                   make_12s_16s_simulated_reads_6-fakeGenes_Random_RESULTS_dada2_asv.fa 
##                                                                                                    600 
##                     make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFlow_12S_RESULTS_dada2_asv.fa 
##                                                                                                    229 
##                     make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFlow_16S_RESULTS_dada2_asv.fa 
##                                                                                                    184 
##                           make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_12S_RESULTS_dada2_asv.fa 
##                                                                                                    977 
##                           make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_16S_RESULTS_dada2_asv.fa 
##                                                                                                    732
table(sixteenS_data$Query)
## 
##                                                                                KWest_16S_PooledTrue.fa 
##                                                                                                   2766 
##   make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_Lulu_RESULTS_dada2_asv.fa 
##                                                                                                    720 
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_noLulu_RESULTS_dada2_asv.fa 
##                                                                                                    720 
##   make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_Lulu_RESULTS_dada2_asv.fa 
##                                                                                                    922 
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_noLulu_RESULTS_dada2_asv.fa 
##                                                                                                    922 
##                               make_12s_16s_simulated_reads_6-fakeGenes_GreenGenes_RESULTS_dada2_asv.fa 
##                                                                                                    594 
##                                   make_12s_16s_simulated_reads_6-fakeGenes_Random_RESULTS_dada2_asv.fa 
##                                                                                                    600 
##                     make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFlow_12S_RESULTS_dada2_asv.fa 
##                                                                                                    178 
##                     make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFlow_16S_RESULTS_dada2_asv.fa 
##                                                                                                    236 
##                           make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_12S_RESULTS_dada2_asv.fa 
##                                                                                                    738 
##                           make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_16S_RESULTS_dada2_asv.fa 
##                                                                                                   1061
table(sixteenS_data$Subject)
## 
## 16S_v04_final.fasta 
##                9457
twelveS_data_vs_12S_100 <- twelveS_data |> 
  filter(Query == 'make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_12S_RESULTS_dada2_asv.fa')
sixteenS_data_vs_16S_100 <- sixteenS_data |> 
  filter(Query == 'make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_16S_RESULTS_dada2_asv.fa' )
twelveS_data_vs_12S_100 |> select(Type, species) |> filter(species != 'dropped' &
                                                             !is.na(species)) |>
  group_by(Type) |> count(species) |> summarise(n = n()) |>
  ggplot(aes(x = Type, y = n, fill = n)) + geom_col() + coord_flip() + 
  theme_minimal() + 
  ylab('Count') + 
  ggtitle('12S: Species-level hits per classifier')

twelveS_data_vs_12S_100 |> select(Type, genus) |> filter(genus != 'dropped' &
                                                             !is.na(genus)) |>
  group_by(Type) |> count(genus) |> summarise(n = n()) |>
  ggplot(aes(x = Type, y = n, fill = n)) + geom_col() + coord_flip() + 
  theme_minimal() + 
  ylab('Count') + 
  ggtitle('12S: Genus-level hits per classifier')

twelveS_truth <- truth |> filter(Query == 'make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_12S_RESULTS_dada2_asv.fa') |> select(OTU, family, species) |> rename(True_OTU = OTU, True_family = family, True_species = species)
head(twelveS_truth)
## # A tibble: 6 × 3
##   True_OTU True_family  True_species            
##   <chr>    <chr>        <chr>                   
## 1 ASV_1    Syngnathidae Phyllopteryx taeniolatus
## 2 ASV_2    Syngnathidae Phycodurus eques        
## 3 ASV_3    Syngnathidae Phyllopteryx taeniolatus
## 4 ASV_4    Labridae     dropped                 
## 5 ASV_5    Alopiidae    Isurus oxyrinchus       
## 6 ASV_6    Carangidae   Seriola lalandi
twelveS_data_vs_12S_100 |> left_join(twelveS_truth, by = c('OTU' = 'True_OTU')) |>
  mutate(Correct = True_species == species) |> 
  filter(species != 'dropped' & !is.na(species)) |> 
  group_by(Type) |> count(Correct) |> 
  ggplot(aes(x = fct_rev(fct_reorder2(Type, Correct, n)), fill = Correct, y = n))+ geom_col() +
  coord_flip() + theme_minimal() + xlab('Type') + 
  ggtitle('12S: Correct and incorrect species-level classifications (absolute)') +
  scale_fill_manual(values = c("#E69F00", "#56B4E9", "#009E73",
          "#F0E442", "#0072B2", "#D55E00", "#CC79A7"))

cols <- c('Correct species' = "#009E73", 'Correct genus'="#56B4E9", 'Correct family' = "#0072B2", 'Incorrect family' = "#E69F00", 'Incorrect genus'="#F0E442", 'Incorrect species'="#D55E00", 'No hit'= "#CC79A7")
twelve_s_relative_plot <- twelveS_data_vs_12S_100 |> left_join(twelveS_truth, by = c('OTU' = 'True_OTU')) |>
  separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|> 
  mutate(species = na_if(species, 'dropped')) |> 
  mutate(genus = na_if(genus, 'dropped')) |> 

  mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
                              !is.na(species) & True_species != species ~ 'Incorrect species',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
                              !is.na(family) & True_family == family ~ 'Correct family',
                              !is.na(family) & True_family != family ~ 'Incorrect family',
                              TRUE ~ NA)) |> 
  group_by(Type) |>
  count(CorrectSpecies) |> 
  mutate(total = sum(n, na.rm=TRUE)) |> 
  mutate(missing = 102 - total) |> 
  group_modify(~ add_row(.x)) |> 
  group_modify(~ {
    .x |> mutate(new_col= max(missing, na.rm=TRUE)) |> 
      mutate(n = case_when(is.na(CorrectSpecies) & is.na(missing) ~ new_col,
                                  TRUE ~ n)) |> 
      select(-new_col)
  } ) |> 
  mutate(total = 102) |> 
  mutate(perc = n / total * 100) |> 
  mutate(CorrectSpecies = replace_na(CorrectSpecies, 'No hit')) |> 
  mutate(CorrectSpecies = factor(CorrectSpecies, rev(c('Correct species', 'Correct genus', 'Correct family', 'Incorrect family', 'Incorrect genus', 'Incorrect species', 'No hit')))) |> 
  ggplot(aes(x = fct_rev(fct_reorder2(Type, CorrectSpecies, n)), fill = CorrectSpecies, y = perc))+ 
  geom_col() +
  coord_flip() + 
  theme_minimal() + 
  ylab('Percentage') + xlab('Type') +
  ggtitle('12S: Correct and incorrect species-level classifications (relative)') +
  scale_fill_manual(name = 'Outcome', values = cols, breaks=names(cols))
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 11 rows [3, 95, 210, 275,
## 325, 436, 516, 573, 675, 777, 879].
twelve_s_relative_plot

### Calculate Upset-based species sightings

type_list <- twelveS_data_vs_12S_100 |> select(Type, species) |> unique() |> filter(!is.na(species) & species != 'dropped') |> 
  group_by(species) |> 
  summarize('Type' = list(Type))

a <- type_list |>
  ggplot(aes(x = Type)) + 
  geom_bar() +
  scale_x_upset(n_intersections = 12) +
  ggtitle('12S: Shared species') +
  ylab('Species')
a
## Warning: Removed 46 rows containing non-finite values (`stat_count()`).

type_list <- twelveS_data_vs_12S_100 |> left_join(twelveS_truth, by = c('OTU' = 'True_OTU')) |>
  filter(species == True_species) |> 
  filter(species != 'dropped' & !is.na(species)) |> 
  select(Type, species) |> unique() |> 
  group_by(species) |> 
  summarize('Type' = list(Type))

b <- type_list |>
  ggplot(aes(x = Type)) + 
  geom_bar() +
  scale_x_upset(n_intersections = 12) +
  ggtitle('12S: Shared correct species') +
  ylab('Species')
b
## Warning: Removed 25 rows containing non-finite values (`stat_count()`).

type_list <- twelveS_data_vs_12S_100 |> left_join(twelveS_truth, by = c('OTU' = 'True_OTU')) |>
  filter(species != True_species) |> 
  filter(species != 'dropped' & !is.na(species)) |> 
  select(Type, species) |> unique() |> 
  group_by(species) |> 
  summarize('Type' = list(Type))

c <- type_list |>
  ggplot(aes(x = Type)) + 
  geom_bar() +
  scale_x_upset(n_intersections = 12) +
  ggtitle('12S: Shared incorrect species') +
  ylab('Species')
c
## Warning: Removed 9 rows containing non-finite values (`stat_count()`).

a + b + c & ylim(c(0, 30)) & 
  theme(
  # Hide panel borders and remove grid lines
  panel.border = element_blank(),
  panel.grid.major = element_blank(),
  panel.background = element_blank(),
  panel.grid.minor = element_blank(),
  #panel.grid.major.y = element_line(),
  # Change axis line
  axis.line = element_line(colour = "black")
  )

Calculate FP/TP/TN/FN

add_scores <- function(twelveS_data_vs_12S_100_with_MaxTruth, twelveS_truth ) {
  twelveS_data_vs_12S_100_with_MaxTruth|> left_join(twelveS_truth, by = c('OTU' = 'True_OTU')) |>
  separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|> 
  mutate(species = na_if(species, 'dropped')) |> 
  mutate(genus = na_if(genus, 'dropped')) |> 

  mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
                              !is.na(species) & True_species != species ~ 'Incorrect species',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
                              !is.na(family) & True_family == family ~ 'Correct family',
                              !is.na(family) & True_family != family ~ 'Incorrect family',
                              TRUE ~ NA)) |> 
  group_by(Type) |>
  summarise(TP = sum(str_detect(CorrectSpecies, pattern='Correct species'), na.rm=TRUE),
            FP = sum(str_detect(CorrectSpecies, pattern = 'Incorrect species'), na.rm=TRUE),
            TN = sum(str_detect(replace_na(CorrectSpecies,'NA'), pattern='NA') & is.na(True_species), na.rm=TRUE),
            FN = sum(is.na(species) & !is.na(True_species))) |> 
  mutate(sums = TP + FP + TN + FN) |> 
  mutate(missing = 102 - sums) |> 
  mutate(FN = FN + missing) |> 
  mutate(sums = TP + FP + TN + FN) |> 
  select(-c(missing, sums))
}
scores <- add_scores(twelveS_data_vs_12S_100, twelveS_truth)
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 11 rows [3, 95, 210, 275,
## 325, 436, 516, 573, 675, 777, 879].
scores <- scores |> rowwise() |> mutate(recall = recall(TP, FN), precision = precision(TP, FP),
                              f1 = f1(precision, recall), f0.5 = f0.5(precision, recall), accuracy = accuracy(TP, FP, FN, TN))
scores
## # A tibble: 11 × 10
## # Rowwise: 
##    Type           TP    FP    TN    FN recall precision    f1  f0.5 accuracy
##    <chr>       <int> <int> <int> <dbl>  <dbl>     <dbl> <dbl> <dbl>    <dbl>
##  1 BLAST100       57     2     0    43  0.57      0.966 0.717 0.848    0.559
##  2 BLAST97        59     8     0    35  0.628     0.881 0.733 0.815    0.578
##  3 Kraken_0.0     60    12     0    30  0.667     0.833 0.741 0.794    0.588
##  4 Kraken_0.05    57     9     0    36  0.613     0.864 0.717 0.798    0.559
##  5 Kraken_0.1     53     5     0    44  0.546     0.914 0.684 0.805    0.520
##  6 MMSeqs2        64     7     0    31  0.674     0.901 0.771 0.844    0.627
##  7 Metabuli       40     6     0    56  0.417     0.870 0.563 0.714    0.392
##  8 Mothur         42    22     0    38  0.525     0.656 0.583 0.625    0.412
##  9 NBC            55    18     0    29  0.655     0.753 0.701 0.731    0.539
## 10 Qiime2         31    18     0    53  0.369     0.633 0.466 0.554    0.304
## 11 VSEARCH        52    17     0    33  0.612     0.754 0.675 0.720    0.510
twelveS_scoreS_plot <- scores |> select(-c(TP, FP, FN, TN)) |> pivot_longer(-Type, names_to='Score') |>  ggplot(aes(x = fct_rev(fct_reorder(Type, value)), y = value, group=Score, color = Score, fill =Score)) + geom_line() + ylim(c(0, 1))  + theme_minimal_hgrid()+ theme(axis.text.x = element_text( angle = 45, hjust = 1)) + ylab('Score') + xlab('Tool') + ggtitle('12S scores')
twelveS_scoreS_plot

Scores heatmap

Let’s also make a heatmap from that

b <- scores$Type
m <- scores |> select(-Type) |> select(accuracy, recall, precision, f1, f0.5) |> as.matrix()
rownames(m) <- b
pheatmap(m, cluster_cols=FALSE, display_numbers = TRUE,
         color = colorRampPalette(brewer.pal(n = 7, name =
  "RdYlGn"))(100))

b <- scores$Type
m <- scores |> select(-Type) |> select(TP, FP, FN) |> as.matrix()
rownames(m) <- b
pheatmap(m, cluster_cols=FALSE, display_numbers = TRUE,
         color = colorRampPalette(brewer.pal(n = 7, name =
  "RdYlGn"))(100))

Metaclassifier

table(twelveS_data_vs_12S_100$Type)
## 
##    BLAST100     BLAST97  Kraken_0.0 Kraken_0.05  Kraken_0.1    Metabuli 
##          79          93         102         102         102          75 
##     MMSeqs2      Mothur         NBC      Qiime2     VSEARCH 
##         102         102         102          49          69

First, we count the per-OTU species hits

twelveS_data_vs_12S_100_maxCount <- twelveS_data_vs_12S_100 |>  
  mutate(species = na_if(species, 'dropped')) |> 
  filter(!is.na(species)) |> 
  #filter(! Type %in% c('Mothur', 'VSEARCH', 'Kraken_0.2', 'Qiime2', 'Metabuli', 'NBC', 'BLAST97', 'Kraken_0.0', 'Kraken_0.1')) |> 
  group_by(Query, Subject, OTU) |> 
  count(species) |> 
  # double check the truth
  #left_join(twelveS_truth, by = c('OTU' = 'True_OTU')) |> 
  #mutate(Truth = True_species == species) |> 
  # pull out the per-group highest n
  filter( n > 4) |> 
  slice_max(n, n=1, with_ties = FALSE) |> 
  mutate(Type = 'MaxCount', .before = 'Query') |> 
  select(-n)
twelveS_data_vs_12S_100_maxCount
## # A tibble: 70 × 5
## # Groups:   Query, Subject, OTU [70]
##    Type     Query                                          Subject OTU   species
##    <chr>    <chr>                                          <chr>   <chr> <chr>  
##  1 MaxCount make_12s_16s_simulated_reads_8-Rottnest_runED… 12s_v0… ASV_1 Phyllo…
##  2 MaxCount make_12s_16s_simulated_reads_8-Rottnest_runED… 12s_v0… ASV_… Carcha…
##  3 MaxCount make_12s_16s_simulated_reads_8-Rottnest_runED… 12s_v0… ASV_… Isurus…
##  4 MaxCount make_12s_16s_simulated_reads_8-Rottnest_runED… 12s_v0… ASV_… Acanth…
##  5 MaxCount make_12s_16s_simulated_reads_8-Rottnest_runED… 12s_v0… ASV_… Pseudo…
##  6 MaxCount make_12s_16s_simulated_reads_8-Rottnest_runED… 12s_v0… ASV_… Parupe…
##  7 MaxCount make_12s_16s_simulated_reads_8-Rottnest_runED… 12s_v0… ASV_… Sphyrn…
##  8 MaxCount make_12s_16s_simulated_reads_8-Rottnest_runED… 12s_v0… ASV_… Parupe…
##  9 MaxCount make_12s_16s_simulated_reads_8-Rottnest_runED… 12s_v0… ASV_… Chaeto…
## 10 MaxCount make_12s_16s_simulated_reads_8-Rottnest_runED… 12s_v0… ASV_… Dascyl…
## # ℹ 60 more rows
twelveS_data_vs_12S_100_with_MaxTruth <- twelveS_data_vs_12S_100 |> 
  bind_rows(twelveS_data_vs_12S_100_maxCount) #|>  
  #filter(! Type %in% c('Mothur', 'VSEARCH', 'Kraken_0.2', 'Qiime2', 'Metabuli', 'NBC', 'BLAST97', 'Kraken_0.0', 'Kraken_0.1')) 
maxTruth_scores <-  add_scores(twelveS_data_vs_12S_100_with_MaxTruth, twelveS_truth )
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 11 rows [3, 95, 210, 275,
## 325, 436, 516, 573, 675, 777, 879].
maxTruth_scores <- maxTruth_scores |> rowwise() |> mutate(recall = recall(TP, FN), precision = precision(TP, FP),
                              f1 = f1(precision, recall), f0.5 = f0.5(precision, recall), accuracy = accuracy(TP, FP, FN, TN))
maxTruth_scoreS_plot <- maxTruth_scores |> select(-c(TP, FP, FN, TN)) |> pivot_longer(-Type, names_to='Score') |>  ggplot(aes(x = fct_rev(fct_reorder(Type, value)), y = value, group=Score, color = Score, fill =Score)) + geom_line() + ylim(c(0, 1))  + theme_minimal_hgrid()+ theme(axis.text.x = element_text( angle = 45, hjust = 1)) + ylab('Score') + xlab('Tool') + geom_point() + ggtitle('12S scores')
maxTruth_scoreS_plot

With the Rottnest data MaxCount performs slightly better! But still not as good as MMSeqs2….

16S

sixteenS_truth <- truth |> filter(Query == 'make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_16S_RESULTS_dada2_asv.fa') |> select(OTU, family, species) |> rename(True_OTU = OTU, True_family = family, True_species = species)
tail(sixteenS_truth)
## # A tibble: 6 × 3
##   True_OTU True_family    True_species            
##   <chr>    <chr>          <chr>                   
## 1 ASV_106  Alopiidae      Isurus oxyrinchus       
## 2 ASV_107  Carangidae     Seriola dumerili        
## 3 ASV_108  Carcharhinidae Sphyrna lewini          
## 4 ASV_109  Scombridae     Auxis thazard           
## 5 ASV_110  Syngnathidae   Phycodurus eques        
## 6 ASV_111  Syngnathidae   Phyllopteryx taeniolatus
sixteenS_relative_plot <- sixteenS_data_vs_16S_100 |> left_join(sixteenS_truth, by = c('OTU' = 'True_OTU')) |>
 separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|> 
  mutate(species = na_if(species, 'dropped')) |> 
  mutate(genus = na_if(genus, 'dropped')) |> 

  mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
                              !is.na(species) & True_species != species ~ 'Incorrect species',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
                              !is.na(family) & True_family == family ~ 'Correct family',
                              !is.na(family) & True_family != family ~ 'Incorrect family',
                              TRUE ~ NA)) |> 
  group_by(Type) |> 
  count(CorrectSpecies) |> 
  mutate(total = sum(n)) |> 
  mutate(missing = 112 - total) |> 
  group_modify(~ add_row(.x)) |> 
  group_modify(~ {
    .x |> mutate(new_col= max(missing, na.rm=TRUE)) |> 
      mutate(n = case_when(is.na(CorrectSpecies) & is.na(missing) ~ new_col,
                                  TRUE ~ n)) |> 
      select(-new_col)
  } ) |> 
  mutate(total = 112) |> 
  mutate(perc = n / total * 100) |> 
  mutate(CorrectSpecies = replace_na(CorrectSpecies, 'No hit')) |> 
  mutate(CorrectSpecies = factor(CorrectSpecies, rev(c('Correct species', 'Correct genus', 'Correct family', 'Incorrect family', 'Incorrect genus', 'Incorrect species', 'No hit')))) |> 
  tidyr::complete(CorrectSpecies, fill = list(n=0, total = 112, missing = NA, perc = 0)) |>  
  ggplot(aes(x = fct_rev(fct_reorder2(Type, CorrectSpecies, n)), fill = CorrectSpecies, y = perc))+ 
  geom_col() +
  coord_flip() + 
  theme_minimal() + 
  ylab('Percentage') + xlab('Type') +
  ggtitle('16S: Correct and incorrect species-level classifications (relative)') +
  scale_fill_manual(name = 'Outcome', values = cols, breaks=names(cols))
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 20 rows [7, 9, 104, 105,
## 215, 217, 287, 363, 365, 441, 510, 520, 620, 622, 732, 734, 844, 846, 956,
## 958].
sixteenS_relative_plot 

Calculate scores

scores <- add_scores(sixteenS_data_vs_16S_100, sixteenS_truth)
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 20 rows [7, 9, 104, 105,
## 215, 217, 287, 363, 365, 441, 510, 520, 620, 622, 732, 734, 844, 846, 956,
## 958].
scores
## # A tibble: 11 × 5
##    Type           TP    FP    TN    FN
##    <chr>       <int> <int> <int> <dbl>
##  1 BLAST100       53     3     0    46
##  2 BLAST97        62     6     2    32
##  3 Kraken_0.0     70    16     2    14
##  4 Kraken_0.05    69    11     2    20
##  5 Kraken_0.1     66     8     2    26
##  6 MMSeqs2        69     8     2    23
##  7 Metabuli       40     2     1    59
##  8 Mothur         61    27     2    12
##  9 NBC            56    20     2    24
## 10 Qiime2         56    20     1    25
## 11 VSEARCH        68    11     1    22
scores <- scores |> rowwise() |> mutate(recall = recall(TP, FN), precision = precision(TP, FP),
                              f1 = f1(precision, recall), f0.5 = f0.5(precision, recall), accuracy = accuracy(TP, FP, FN, TN))
scores
## # A tibble: 11 × 10
## # Rowwise: 
##    Type           TP    FP    TN    FN recall precision    f1  f0.5 accuracy
##    <chr>       <int> <int> <int> <dbl>  <dbl>     <dbl> <dbl> <dbl>    <dbl>
##  1 BLAST100       53     3     0    46  0.535     0.946 0.684 0.820    0.520
##  2 BLAST97        62     6     2    32  0.660     0.912 0.765 0.847    0.627
##  3 Kraken_0.0     70    16     2    14  0.833     0.814 0.824 0.818    0.706
##  4 Kraken_0.05    69    11     2    20  0.775     0.862 0.817 0.844    0.696
##  5 Kraken_0.1     66     8     2    26  0.717     0.892 0.795 0.851    0.667
##  6 MMSeqs2        69     8     2    23  0.75      0.896 0.817 0.862    0.696
##  7 Metabuli       40     2     1    59  0.404     0.952 0.567 0.749    0.402
##  8 Mothur         61    27     2    12  0.836     0.693 0.758 0.718    0.618
##  9 NBC            56    20     2    24  0.7       0.737 0.718 0.729    0.569
## 10 Qiime2         56    20     1    25  0.691     0.737 0.713 0.727    0.559
## 11 VSEARCH        68    11     1    22  0.756     0.861 0.805 0.837    0.676
sixteenS_score_plot <- scores |> select(-c(TP, FP, FN, TN)) |> pivot_longer(-Type, names_to='Score') |>  ggplot(aes(x = fct_rev(fct_reorder(Type, value)), y = value, group=Score, color = Score, fill =Score)) + geom_line() + ylim(c(0, 1))  + theme_minimal_hgrid() + theme(axis.text.x = element_text( angle = 45, hjust = 1)) + ylab('Score') + xlab('Tool') +  ggtitle('16S scores')
sixteenS_score_plot 

b <- scores$Type
m <- scores |> select(-Type) |> select(accuracy, recall, precision, f1, f0.5) |> as.matrix()
rownames(m) <- b
pheatmap(m, cluster_cols=FALSE, display_numbers = TRUE,
         color = colorRampPalette(brewer.pal(n = 7, name =
  "RdYlGn"))(100))

b <- scores$Type
m <- scores |> select(-Type) |> select(TP, FP, FN) |> as.matrix()
rownames(m) <- b
pheatmap(m, cluster_cols=FALSE, display_numbers = TRUE,
         color = colorRampPalette(brewer.pal(n = 7, name =
  "RdYlGn"))(100))

### Calculate Upset-based species sightings

type_list <- sixteenS_data_vs_16S_100  |> select(Type, species) |> unique() |> filter(!is.na(species) & species != 'dropped') |> 
  group_by(species) |> 
  summarize('Type' = list(Type))

a <- type_list |>
  ggplot(aes(x = Type)) + 
  geom_bar() +
  scale_x_upset(n_intersections = 12) +
  ggtitle('16S: Shared species') +
  ylab('Species')
a
## Warning: Removed 34 rows containing non-finite values (`stat_count()`).

type_list <- sixteenS_data_vs_16S_100 |> left_join(sixteenS_truth,  by = c('OTU' = 'True_OTU')) |>
  filter(species == True_species) |> 
  filter(species != 'dropped' & !is.na(species)) |> 
  select(Type, species) |> unique() |> 
  group_by(species) |> 
  summarize('Type' = list(Type))

b <- type_list |>
  ggplot(aes(x = Type)) + 
  geom_bar() +
  scale_x_upset(n_intersections = 12) +
  ggtitle('16S: Shared correct species') +
  ylab('Species')
b
## Warning: Removed 17 rows containing non-finite values (`stat_count()`).

type_list <- sixteenS_data_vs_16S_100 |> left_join(sixteenS_truth, by = c('OTU' = 'True_OTU')) |>
  filter(species != True_species) |> 
  filter(species != 'dropped' & !is.na(species)) |> 
  select(Type, species) |> unique() |> 
  group_by(species) |> 
  summarize('Type' = list(Type))

c <- type_list |>
  ggplot(aes(x = Type)) + 
  geom_bar() +
  scale_x_upset(n_intersections = 12) +
  ggtitle('16S: Shared incorrect species') +
  ylab('Species')
c
## Warning: Removed 11 rows containing non-finite values (`stat_count()`).

a + b + c & ylim(c(0, 20)) & 
  theme(
  # Hide panel borders and remove grid lines
  panel.border = element_blank(),
  panel.grid.major = element_blank(),
  panel.background = element_blank(),
  panel.grid.minor = element_blank(),
  #panel.grid.major.y = element_line(),
  # Change axis line
  axis.line = element_line(colour = "black")
  )

16S/12S relative plots

sixteenS_relative_plot / twelve_s_relative_plot

Let’s make without titles, but with a/b

(sixteenS_relative_plot + ggtitle('') + ylab(''))/ (twelve_s_relative_plot + ggtitle('')) + 
  plot_annotation(tag_levels = c('A','B')) +
  plot_layout(guides = 'collect')

(sixteenS_score_plot +geom_point() + theme(axis.title.x = element_blank()))/ (twelveS_scoreS_plot + geom_point())

12S exclusion databases

twelve_exclusions <- data |> filter(str_starts(Subject, '12s_v010_final.fasta_Taxonomies.CountedFams.txt_')) |> 
  filter(Query == 'make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_12S_RESULTS_dada2_asv.fa')
table(twelve_exclusions$Subject)
## 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_1.fasta 
##                                                                     840 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_10.fasta 
##                                                                     844 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_2.fasta 
##                                                                     860 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_3.fasta 
##                                                                     837 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_4.fasta 
##                                                                     811 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_5.fasta 
##                                                                     845 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_6.fasta 
##                                                                     819 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_7.fasta 
##                                                                     827 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_8.fasta 
##                                                                     875 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_9.fasta 
##                                                                     852 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_1.fasta 
##                                                                     888 
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_10.fasta 
##                                                                     865 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_2.fasta 
##                                                                     857 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_3.fasta 
##                                                                     882 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_4.fasta 
##                                                                     889 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_5.fasta 
##                                                                     919 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_6.fasta 
##                                                                     851 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_7.fasta 
##                                                                     920 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_8.fasta 
##                                                                     856 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_9.fasta 
##                                                                     873 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_1.fasta 
##                                                                     775 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_10.fasta 
##                                                                     758 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_2.fasta 
##                                                                     757 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_3.fasta 
##                                                                     730 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_4.fasta 
##                                                                     725 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_5.fasta 
##                                                                     775 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_6.fasta 
##                                                                     727 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_7.fasta 
##                                                                     742 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_8.fasta 
##                                                                     754 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_9.fasta 
##                                                                     804
twelve_exclusions_split <- twelve_exclusions |> separate(Subject, into = c('before', 'hit'), sep='.txt_') |> 
  # get rid of leftover non-subsetted databases
  filter(!is.na(hit)) |> 
  separate(hit, into=c('Database', 'after'), sep='_subset')
twelve_exclusions_split_averaged <- twelve_exclusions_split |> left_join(twelveS_truth, by = c('OTU' = 'True_OTU')) |>
  separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|> 
  mutate(species = na_if(species, 'dropped')) |> 
  mutate(genus = na_if(genus, 'dropped')) |> 

  mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
                              !is.na(species) & True_species != species ~ 'Incorrect species',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
                              !is.na(family) & True_family == family ~ 'Correct family',
                              !is.na(family) & True_family != family ~ 'Incorrect family',
                              TRUE ~ NA)) |> 
  group_by(Type, Database, after) |>
  summarise(TP = sum(str_detect(CorrectSpecies, pattern='Correct species'), na.rm=TRUE),
            FP = sum(str_detect(CorrectSpecies, pattern = 'Incorrect species'), na.rm=TRUE),
            TN = sum(str_detect(replace_na(CorrectSpecies,'NA'), pattern='NA') & is.na(True_species), na.rm=TRUE),
            FN = sum(is.na(species) & !is.na(True_species))) |> 
  mutate(sums = TP + FP + TN + FN) |> 
  mutate(missing = 102 - sums) |> 
  mutate(FN = FN + missing) |> 
  mutate(sums = TP + FP + TN + FN) |> 
  select(-c(missing, sums)) |> 
  group_by(Type, Database) |> 
  summarise(mean_TP = mean(TP),
            mean_FP = mean(FP),
            mean_TN = mean(TN),
            mean_FN = mean(FN)) |> 
  rowwise() |> 
  mutate(recall = recall(mean_TP, mean_FN), 
         precision = precision(mean_TP, mean_FP),
         f1 = f1(precision, recall),
         f0.5 = f0.5(precision, recall),
         accuracy = accuracy(mean_TP, mean_FP, mean_FN, mean_TN)) 
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 272 rows [1, 55, 107,
## 164, 253, 304, 396, 572, 689, 752, 817, 945, 1021, 1081, 1168, 1256, 1353,
## 1395, 1441, 1488, ...].
## `summarise()` has grouped output by 'Type', 'Database'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'Type'. You can override using the
## `.groups` argument.
twelve_exclusions_split_averaged <- twelve_exclusions_split_averaged |> 
  mutate(Database = case_when ( Database == 'fifty' ~ '50%',
                               Database == 'seventy' ~ '70%',
                               Database == 'thirty' ~ '30%',
                               TRUE ~ Database))
f1_pl <- twelve_exclusions_split_averaged |> 
  ggplot(aes(x = Type, y = f1, group = Database, color = Database, fill = Database)) + 
  geom_point() +
  geom_line() +
  theme_minimal()
f0.5_pl <- twelve_exclusions_split_averaged |> 
  ggplot(aes(x = Type, y = f0.5, group = Database, color = Database, fill = Database)) + 
  geom_point() +
  geom_line() +
  theme_minimal()
precision_pl <- twelve_exclusions_split_averaged |> 
  ggplot(aes(x = Type, y = precision, group = Database, color = Database, fill = Database)) + 
  geom_point() +
  geom_line() +
  theme_minimal()
recall_pl <- twelve_exclusions_split_averaged |> 
  ggplot(aes(x = Type, y = recall, group = Database, color = Database, fill = Database)) + 
  geom_point() +
  geom_line() +
  theme_minimal()
(f1_pl / f0.5_pl / precision_pl / recall_pl) +  plot_layout(guides = 'collect')

Lets zero in on the precision and make boxplots with jitter dots

un_summarised_twelve <- twelve_exclusions_split |> left_join(twelveS_truth, by = c('OTU' = 'True_OTU')) |>
  separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|> 
  mutate(species = na_if(species, 'dropped')) |> 
  mutate(genus = na_if(genus, 'dropped')) |> 

  mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
                              !is.na(species) & True_species != species ~ 'Incorrect species',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
                              !is.na(family) & True_family == family ~ 'Correct family',
                              !is.na(family) & True_family != family ~ 'Incorrect family',
                              TRUE ~ NA)) |> 
  group_by(Type, Database, after) |>
  summarise(TP = sum(str_detect(CorrectSpecies, pattern='Correct species'), na.rm=TRUE),
            FP = sum(str_detect(CorrectSpecies, pattern = 'Incorrect species'), na.rm=TRUE),
            TN = sum(str_detect(replace_na(CorrectSpecies,'NA'), pattern='NA') & is.na(True_species), na.rm=TRUE),
            FN = sum(is.na(species) & !is.na(True_species))) |> 
  mutate(sums = TP + FP + TN + FN) |> 
  mutate(missing = 102 - sums) |> 
  mutate(FN = FN + missing) |> 
  mutate(sums = TP + FP + TN + FN) |> 
  select(-c(missing, sums)) |> 
  rowwise() |> 
  mutate(recall = recall(TP, FN), 
         precision = precision(TP, FP),
         f1 = f1(precision, recall),
         f0.5 = f0.5(precision, recall),
         accuracy = accuracy(TP, FP, FN, TN)) |> 
  mutate(Database = case_when ( Database == 'fifty' ~ '50%',
                               Database == 'seventy' ~ '70%',
                               Database == 'thirty' ~ '30%',
                               TRUE ~ Database))
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 272 rows [1, 55, 107,
## 164, 253, 304, 396, 572, 689, 752, 817, 945, 1021, 1081, 1168, 1256, 1353,
## 1395, 1441, 1488, ...].
## `summarise()` has grouped output by 'Type', 'Database'. You can override using
## the `.groups` argument.
un_summarised_twelve |> group_by(Type, Database) |> mutate(best = max(mean(precision))) |>
  ggplot(aes(x = fct_reorder(Type, best), group = interaction(Type, Database),  color=Database, y = precision)) +
  geom_boxplot(outlier.shape = NA) +
  coord_flip() +
  theme_minimal() +
  xlab('Type') +
  ylab('Precision') +
  geom_point(position = position_jitterdodge(), alpha=0.5)

un_summarised_twelve  |> group_by(Type, Database) |> mutate(best = max(mean(f0.5))) |>
  ggplot(aes(x = fct_reorder(Type, best), group = interaction(Type, Database), color=Database, y = f0.5)) +
  geom_boxplot(outlier.shape = NA) + 
  coord_flip() + 
  theme_minimal() + 
  xlab('Type') +
  ylab('f0.5') +
  ylim(c(0, 1)) +
  geom_point(position = position_jitterdodge(), alpha=0.5)

un_summarised_twelve  |> group_by(Type, Database) |> mutate(best = max(mean(recall))) |>
  ggplot(aes(x = fct_reorder(Type, best), group = interaction(Type, Database), color=Database, y = recall)) +
  geom_boxplot(outlier.shape = NA) + 
  coord_flip() + 
  theme_minimal() + 
  xlab('Type') +
  ylab('f0.5') +
  ylim(c(0, 1)) +
  geom_point(position = position_jitterdodge(), alpha=0.5)

un_summarised_twelve  |> 
  filter(Type %in% c('BLAST100', 'Kraken_0.0', 'Kraken_0.05', 'Qiime2')) |> 
  ggplot(aes(x=Database, y = precision, fill=Type)) + #fill=factor(Database, levels=c('30%','50%','70%')))) + 
  geom_boxplot() +
  labs(fill='Type') + 
  ylab('Precision') + 
  theme_minimal()

false_positives <- un_summarised_twelve  |> 
  filter(Type %in% c('BLAST100', 'Kraken_0.0', 'Kraken_0.05', 'Kraken_0.1', 'MMSeqs2')) |> 
  ggplot(aes(x=Database, y = FP/102*100, fill=Type)) + #fill=factor(Database, levels=c('30%','50%','70%')))) + 
  geom_boxplot(outlier.shape=NA) +
  labs(fill='Type') + 
  ylab('False positives (%)') + 
  geom_point(aes(color=Type), 
             position = position_jitterdodge(jitter.width = 0.2), 
             alpha=0.5,
             show.legend = FALSE)+
  theme_minimal()
false_positives

true_positives <- un_summarised_twelve  |> 
  filter(Type %in% c('BLAST100', 'Kraken_0.0', 'Kraken_0.05', 'Kraken_0.1', 'MMSeqs2')) |> 
  ggplot(aes(x=Database, y = TP/102*100, fill=Type)) + #fill=factor(Database, levels=c('30%','50%','70%')))) + 
  geom_boxplot(outlier.shape=NA) +
  labs(fill='Type') + 
  ylab('True positives (%)') + 
  geom_point(aes(color=Type), 
           position = position_jitterdodge(jitter.width = 0.2), 
           alpha=0.5,
           show.legend = FALSE)+
  theme_minimal()
true_positives

false_positives/ true_positives + plot_layout(guides = 'collect') & coord_flip()

### Phylogenetic diversity

We can also easily calculate alpha diversity across these tools, as alpha diversity is the number of species. We treat classifiers/Types as sites.

spec_summarised <- twelve_exclusions_split |> 
  group_by(Type, Query, Database, after) |> 
  mutate(Database = case_when ( Database == 'fifty' ~ '50%',
                               Database == 'seventy' ~ '70%',
                               Database == 'thirty' ~ '30%',
                               TRUE ~ Database)) |> 
  filter(!is.na(species)) |> 
  summarise(`Alpha diversity index` = length(unique(species)))
## `summarise()` has grouped output by 'Type', 'Query', 'Database'. You can
## override using the `.groups` argument.
spec_summarised |> 
  ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) + 
  geom_boxplot() +
  geom_point(aes(color=Type), 
           position = position_jitterdodge(jitter.width = 0.2), 
           alpha=0.5,
           show.legend = FALSE) + 
  facet_wrap(~Database) + coord_flip() + theme_minimal()

Let’s also do not all of the classifiers

spec_summarised |> 
  filter(Type %in% c('BLAST100', 'Kraken_0.0', 'Kraken_0.05', 'Kraken_0.1','MMSeqs2', 'Qiime2')) |> 
  ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) + 
  geom_boxplot() +
  geom_point(aes(color=Type), 
           position = position_jitterdodge(jitter.width = 0.2), 
           alpha=0.5,
           show.legend = FALSE) + 
  facet_wrap(~Database) + coord_flip() + theme_minimal()

a <- spec_summarised |> 
  filter(Type %in% c('BLAST100', 'BLAST97', 'Kraken_0.0','Kraken_0.05', 'Kraken_0.1','MMSeqs2', 'Qiime2')) |> 
  group_by(Database) |> 
  arrange(Database) |> 
  group_map(~aov(`Alpha diversity index` ~ Type, data=.))

names(a) <- spec_summarised |> arrange(Database) |> pull(Database) |> unique() # I don't like this :(
a
## $`30%`
## Call:
##    aov(formula = `Alpha diversity index` ~ Type, data = .)
## 
## Terms:
##                     Type Residuals
## Sum of Squares  4247.086  1315.200
## Deg. of Freedom        6        63
## 
## Residual standard error: 4.569047
## Estimated effects may be unbalanced
## 
## $`50%`
## Call:
##    aov(formula = `Alpha diversity index` ~ Type, data = .)
## 
## Terms:
##                     Type Residuals
## Sum of Squares  3679.743   836.200
## Deg. of Freedom        6        63
## 
## Residual standard error: 3.643215
## Estimated effects may be unbalanced
## 
## $`70%`
## Call:
##    aov(formula = `Alpha diversity index` ~ Type, data = .)
## 
## Terms:
##                     Type Residuals
## Sum of Squares  2587.771   999.600
## Deg. of Freedom        6        63
## 
## Residual standard error: 3.983298
## Estimated effects may be unbalanced
library(agricolae)
## Registered S3 methods overwritten by 'klaR':
##   method      from 
##   predict.rda vegan
##   print.rda   vegan
##   plot.rda    vegan
groupslist <- list()

for(key in names(a)) {
  print(key)
  groupslist[[key]] <- HSD.test(a[[key]], 'Type')$groups|> 
    as_tibble(rownames = 'Type') |> 
    select(-`Alpha diversity index`)
}
## [1] "30%"
## [1] "50%"
## [1] "70%"
groups_df <- bind_rows(groupslist, .id='Database')
spec_summarised |> 
  filter(Type %in% c('BLAST100', 'BLAST97', 'Kraken_0.0', 'Kraken_0.05', 'Kraken_0.1','MMSeqs2', 'Qiime2')) |> 
  left_join(groups_df, by = c('Database', 'Type')) |> 
  ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) + 
  geom_boxplot(outlier.shape=NA) +
  geom_point(aes(color=Type), 
           position = position_jitterdodge(jitter.width = 0.2), 
           alpha=0.5,
           show.legend = FALSE) + 
  facet_wrap(~Database) + 
  geom_text(aes(x = Type, y = max(`Alpha diversity index`) + 2, label = groups),
            #col = 'black',
            size = 4) +
  #coord_flip() + 
  theme_minimal() +
  theme(axis.text.x = element_text( angle = 90, hjust = 1)) +
  guides(fill="none")

16S exclusion databases

sixteen_exclusions <- data |> filter(str_starts(Subject, '16S_v04_final.fasta_Taxonomies.')) |> 
  filter(Query == 'make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_16S_RESULTS_dada2_asv.fa')
table(sixteen_exclusions$Subject)
## 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_1.fasta 
##                                                                    873 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_10.fasta 
##                                                                    861 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_2.fasta 
##                                                                    951 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_3.fasta 
##                                                                    870 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_4.fasta 
##                                                                    871 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_5.fasta 
##                                                                    880 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_6.fasta 
##                                                                    904 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_7.fasta 
##                                                                    933 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_8.fasta 
##                                                                    843 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_9.fasta 
##                                                                    867 
##  16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_1.fasta 
##                                                                    979 
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_10.fasta 
##                                                                    992 
##  16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_2.fasta 
##                                                                    955 
##  16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_3.fasta 
##                                                                    963 
##  16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_4.fasta 
##                                                                    906 
##  16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_5.fasta 
##                                                                   1014 
##  16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_6.fasta 
##                                                                    936 
##  16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_7.fasta 
##                                                                    960 
##  16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_8.fasta 
##                                                                    991 
##  16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_9.fasta 
##                                                                    993 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_1.fasta 
##                                                                    796 
##  16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_10.fasta 
##                                                                    824 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_2.fasta 
##                                                                    814 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_3.fasta 
##                                                                    790 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_4.fasta 
##                                                                    869 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_5.fasta 
##                                                                    806 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_6.fasta 
##                                                                    745 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_7.fasta 
##                                                                    856 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_8.fasta 
##                                                                    785 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_9.fasta 
##                                                                    805
sixteen_exclusions_split <- sixteen_exclusions |> separate(Subject, into = c('before', 'hit'), sep='.txt_') |> 
  # get rid of leftover non-subsetted databases
  filter(!is.na(hit)) |> 
  separate(hit, into=c('Database', 'after'), sep='_subset')
sixteen_exclusions_split_averaged <- sixteen_exclusions_split |> left_join(sixteenS_truth, by = c('OTU' = 'True_OTU')) |>
  separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|> 
  mutate(species = na_if(species, 'dropped')) |> 
  mutate(genus = na_if(genus, 'dropped')) |> 

  mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
                              !is.na(species) & True_species != species ~ 'Incorrect species',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
                              !is.na(family) & True_family == family ~ 'Correct family',
                              !is.na(family) & True_family != family ~ 'Incorrect family',
                              TRUE ~ NA)) |> 
  group_by(Type, Database, after) |>
  summarise(TP = sum(str_detect(CorrectSpecies, pattern='Correct species'), na.rm=TRUE),
            FP = sum(str_detect(CorrectSpecies, pattern = 'Incorrect species'), na.rm=TRUE),
            TN = sum(str_detect(replace_na(CorrectSpecies,'NA'), pattern='NA') & is.na(True_species), na.rm=TRUE),
            FN = sum(is.na(species) & !is.na(True_species))) |> 
  mutate(sums = TP + FP + TN + FN) |> 
  mutate(missing = 102 - sums) |> 
  mutate(FN = FN + missing) |> 
  mutate(sums = TP + FP + TN + FN) |> 
  select(-c(missing, sums)) |> 
  group_by(Type, Database) |> 
  summarise(mean_TP = mean(TP),
            mean_FP = mean(FP),
            mean_TN = mean(TN),
            mean_FN = mean(FN)) |> 
  rowwise() |> 
  mutate(recall = recall(mean_TP, mean_FN), 
         precision = precision(mean_TP, mean_FP),
         f1 = f1(precision, recall),
         f0.5 = f0.5(precision, recall),
         accuracy = accuracy(mean_TP, mean_FP, mean_FN, mean_TN)) 
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 484 rows [3, 165, 210,
## 256, 349, 406, 489, 629, 630, 701, 765, 821, 823, 902, 967, 1030, 1106, 1180,
## 1212, 1262, ...].
## `summarise()` has grouped output by 'Type', 'Database'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'Type'. You can override using the
## `.groups` argument.
sixteen_exclusions_split_averaged <- sixteen_exclusions_split_averaged |> 
  mutate(Database = case_when ( Database == 'fifty' ~ '50%',
                               Database == 'seventy' ~ '70%',
                               Database == 'thirty' ~ '30%',
                               TRUE ~ Database))
f1_pl <- sixteen_exclusions_split_averaged |> 
  ggplot(aes(x = Type, y = f1, group = Database, color = Database, fill = Database)) + 
  geom_point() +
  geom_line() +
  theme_minimal()
f0.5_pl <- sixteen_exclusions_split_averaged |> 
  ggplot(aes(x = Type, y = f0.5, group = Database, color = Database, fill = Database)) + 
  geom_point() +
  geom_line() +
  theme_minimal()
precision_pl <- sixteen_exclusions_split_averaged |> 
  ggplot(aes(x = Type, y = precision, group = Database, color = Database, fill = Database)) + 
  geom_point() +
  geom_line() +
  theme_minimal()
recall_pl <- sixteen_exclusions_split_averaged |> 
  ggplot(aes(x = Type, y = recall, group = Database, color = Database, fill = Database)) + 
  geom_point() +
  geom_line() +
  theme_minimal()
(f1_pl / f0.5_pl / precision_pl / recall_pl) +  plot_layout(guides = 'collect')

Lets zero in on the precision and make boxplots with jitter dots

un_summarised_sixteen <- sixteen_exclusions_split |> left_join(sixteenS_truth, by = c('OTU' = 'True_OTU')) |>
  separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|> 
  mutate(species = na_if(species, 'dropped')) |> 
  mutate(genus = na_if(genus, 'dropped')) |> 

  mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
                              !is.na(species) & True_species != species ~ 'Incorrect species',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
                              !is.na(family) & True_family == family ~ 'Correct family',
                              !is.na(family) & True_family != family ~ 'Incorrect family',
                              TRUE ~ NA)) |> 
  group_by(Type, Database, after) |>
  summarise(TP = sum(str_detect(CorrectSpecies, pattern='Correct species'), na.rm=TRUE),
            FP = sum(str_detect(CorrectSpecies, pattern = 'Incorrect species'), na.rm=TRUE),
            TN = sum(str_detect(replace_na(CorrectSpecies,'NA'), pattern='NA') & is.na(True_species), na.rm=TRUE),
            FN = sum(is.na(species) & !is.na(True_species))) |> 
  mutate(sums = TP + FP + TN + FN) |> 
  mutate(missing = 112 - sums) |> 
  mutate(FN = FN + missing) |> 
  mutate(sums = TP + FP + TN + FN) |> 
  select(-c(missing, sums)) |> 
  rowwise() |> 
  mutate(recall = recall(TP, FN), 
         precision = precision(TP, FP),
         f1 = f1(precision, recall),
         f0.5 = f0.5(precision, recall),
         accuracy = accuracy(TP, FP, FN, TN)) |> 
  mutate(Database = case_when ( Database == 'fifty' ~ '50%',
                               Database == 'seventy' ~ '70%',
                               Database == 'thirty' ~ '30%',
                               TRUE ~ Database))
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 484 rows [3, 165, 210,
## 256, 349, 406, 489, 629, 630, 701, 765, 821, 823, 902, 967, 1030, 1106, 1180,
## 1212, 1262, ...].
## `summarise()` has grouped output by 'Type', 'Database'. You can override using
## the `.groups` argument.
un_summarised_sixteen |> group_by(Type, Database) |> mutate(best = max(mean(precision))) |>
  ggplot(aes(x = fct_reorder(Type, best), group = interaction(Type, Database),  color=Database, y = precision)) +
  geom_boxplot(outlier.shape = NA) +
  coord_flip() +
  theme_minimal() +
  xlab('Type') +
  ylab('Precision') +
  geom_point(position = position_jitterdodge(), alpha=0.5)

un_summarised_sixteen  |> group_by(Type, Database) |> mutate(best = max(mean(f0.5, na.rm=TRUE))) |>
  ggplot(aes(x = fct_reorder(Type, best), group = interaction(Type, Database), color=Database, y = f0.5)) +
  geom_boxplot(outlier.shape = NA) + 
  coord_flip() + 
  theme_minimal() + 
  xlab('Type') +
  ylab('f0.5') +
  ylim(c(0, 1)) +
  geom_point(position = position_jitterdodge(), alpha=0.5)
## Warning: Removed 10 rows containing non-finite values (`stat_boxplot()`).
## Warning: Removed 10 rows containing missing values (`geom_point()`).

un_summarised_sixteen  |> group_by(Type, Database) |> mutate(best = max(mean(recall))) |>
  ggplot(aes(x = fct_reorder(Type, best), group = interaction(Type, Database), color=Database, y = recall)) +
  geom_boxplot(outlier.shape = NA) + 
  coord_flip() + 
  theme_minimal() + 
  xlab('Type') +
  ylab('f0.5') +
  ylim(c(0, 1)) +
  geom_point(position = position_jitterdodge(), alpha=0.5)

un_summarised_sixteen  |> 
  filter(Type %in% c('BLAST100', 'Kraken_0.0',  'Kraken_0.05', 'Qiime2')) |> 
  ggplot(aes(x=Database, y = precision, fill=Type)) + #fill=factor(Database, levels=c('30%','50%','70%')))) + 
  geom_boxplot() +
  labs(fill='Type') + 
  ylab('Precision') + 
  theme_minimal()

false_positives <- un_summarised_sixteen  |> 
  filter(Type %in% c('BLAST100', 'Kraken_0.0',  'Kraken_0.05', 'Kraken_0.1', 'MMSeqs2')) |> 
  ggplot(aes(x=Database, y = FP/112*100, fill=Type)) + #fill=factor(Database, levels=c('30%','50%','70%')))) + 
  geom_boxplot(outlier.shape=NA) +
  labs(fill='Type') + 
  ylab('False positives (%)') + 
  geom_point(aes(color=Type), 
             position = position_jitterdodge(jitter.width = 0.2), 
             alpha=0.5,
             show.legend = FALSE)+
  theme_minimal()
false_positives

true_positives <- un_summarised_sixteen  |> 
  filter(Type %in% c('BLAST100', 'Kraken_0.0',  'Kraken_0.05', 'Kraken_0.1', 'MMSeqs2')) |> 
  ggplot(aes(x=Database, y = TP/112*100, fill=Type)) + #fill=factor(Database, levels=c('30%','50%','70%')))) + 
  geom_boxplot(outlier.shape=NA) +
  labs(fill='Type') + 
  ylab('True positives (%)') + 
  geom_point(aes(color=Type), 
           position = position_jitterdodge(jitter.width = 0.2), 
           alpha=0.5,
           show.legend = FALSE)+
  theme_minimal()
true_positives

false_positives/ true_positives + plot_layout(guides = 'collect') & coord_flip()

### Phylogenetic diversity

We can also easily calculate alpha diversity across these tools, as alpha diversity is the number of species. We treat classifiers/Types as sites.

spec_summarised <- sixteen_exclusions_split |> 
  group_by(Type, Query, Database, after) |> 
  mutate(Database = case_when ( Database == 'fifty' ~ '50%',
                               Database == 'seventy' ~ '70%',
                               Database == 'thirty' ~ '30%',
                               TRUE ~ Database)) |> 
  filter(!is.na(species)) |> 
  summarise(`Alpha diversity index` = length(unique(species)))
## `summarise()` has grouped output by 'Type', 'Query', 'Database'. You can
## override using the `.groups` argument.
spec_summarised |> 
  ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) + 
  geom_boxplot() +
  geom_point(aes(color=Type), 
           position = position_jitterdodge(jitter.width = 0.2), 
           alpha=0.5,
           show.legend = FALSE) + 
  facet_wrap(~Database) + coord_flip() + theme_minimal()

Let’s also do not all of the classifiers

spec_summarised |> 
  filter(Type %in% c('BLAST100', 'Kraken_0.0', 'Kraken_0.05',  'Kraken_0.1','MMSeqs2', 'Qiime2')) |> 
  ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) + 
  geom_boxplot() +
  geom_point(aes(color=Type), 
           position = position_jitterdodge(jitter.width = 0.2), 
           alpha=0.5,
           show.legend = FALSE) + 
  facet_wrap(~Database) + coord_flip() + theme_minimal()

a <- spec_summarised |> 
  filter(Type %in% c('BLAST100', 'BLAST97', 'Kraken_0.0',  'Kraken_0.05', 'Kraken_0.1','MMSeqs2', 'Qiime2')) |> 
  group_by(Database) |> 
  arrange(Database) |> 
  group_map(~aov(`Alpha diversity index` ~ Type, data=.))

names(a) <- spec_summarised |> arrange(Database) |> pull(Database) |> unique() # I don't like this :(
a
## $`30%`
## Call:
##    aov(formula = `Alpha diversity index` ~ Type, data = .)
## 
## Terms:
##                     Type Residuals
## Sum of Squares  8580.307  2011.575
## Deg. of Freedom        6        61
## 
## Residual standard error: 5.742529
## Estimated effects may be unbalanced
## 
## $`50%`
## Call:
##    aov(formula = `Alpha diversity index` ~ Type, data = .)
## 
## Terms:
##                     Type Residuals
## Sum of Squares  7157.943  3359.900
## Deg. of Freedom        6        63
## 
## Residual standard error: 7.302859
## Estimated effects may be unbalanced
## 
## $`70%`
## Call:
##    aov(formula = `Alpha diversity index` ~ Type, data = .)
## 
## Terms:
##                     Type Residuals
## Sum of Squares  4334.886  1085.700
## Deg. of Freedom        6        63
## 
## Residual standard error: 4.151305
## Estimated effects may be unbalanced
library(agricolae)
groupslist <- list()

for(key in names(a)) {
  print(key)
  groupslist[[key]] <- HSD.test(a[[key]], 'Type')$groups|> 
    as_tibble(rownames = 'Type') |> 
    select(-`Alpha diversity index`)
}
## [1] "30%"
## [1] "50%"
## [1] "70%"
groups_df <- bind_rows(groupslist, .id='Database')
spec_summarised |> 
  filter(Type %in% c('BLAST100', 'BLAST97', 'Kraken_0.0', 'Kraken_0.05',  'Kraken_0.1','MMSeqs2', 'Qiime2')) |> 
  left_join(groups_df, by = c('Database', 'Type')) |> 
  ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) + 
  geom_boxplot(outlier.shape=NA) +
  geom_point(aes(color=Type), 
           position = position_jitterdodge(jitter.width = 0.2), 
           alpha=0.5,
           show.legend = FALSE) + 
  facet_wrap(~Database) + 
  geom_text(aes(x = Type, y = max(`Alpha diversity index`) + 2, label = groups),
            #col = 'black',
            size = 4) +
  #coord_flip() + 
  theme_minimal() +
  theme(axis.text.x = element_text( angle = 90, hjust = 1)) +
  guides(fill="none")